import numpy as np
import sqlite3
import matplotlib.pylab as plt
import datetime as DT
import seaborn as sns
np.set_printoptions(precision=15)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#### import the PCA library from scikit learn library
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
# Normalizing (scaling) the data is VERY important - indeed can be important to many machine learning algorithms.
# take our original features and scale them so that they all have zero mean and unit variance
from sklearn import preprocessing
## pandas has read_sql / read_sql_query IO tools that return a pandas.DataFrame.
## coerce_float=True forces numeric SQL values to float.
## First connect to the database by creating a connection object called conn.
conn = sqlite3.connect('database.sqlite')
# A cursor object (conn.cursor()) has methods for accessing the data,
# but it is not needed for the pd.read_sql IO tool, which takes conn directly.
# c = conn.cursor() # this is not needed for the pd.read_sql IO tool
# Get the database table list from the sqlite_master catalog table.
# Follow convention: SQL keywords in all caps.
# Preview all tables in the database.
print ('======')
print ('Tables in the database')
# Fetch all table names and their CREATE statements into a DataFrame
df_tables = pd.read_sql("""SELECT * FROM sqlite_master WHERE type='table';""", conn)
print('df_tables shape: ', df_tables.shape)
print(df_tables)
print ('======')
print('Player_Attributes table:')
# NOTE(review): rows of sqlite_master are assumed to be in a fixed order here
# (index 1 = Player_Attributes, index 2 = Player) -- confirm against the
# printed df_tables above; the order is not guaranteed by SQL.
print(df_tables.sql[1]) # sql that CREATEd the Player_Attributes table
print('Player table:')
print(df_tables.sql[2]) # sql that CREATEd the Player table
print ('======')
print ('Player table')
df_Player = pd.read_sql("""SELECT * FROM Player """, conn)
print('df_Player.shape:', df_Player.shape)
print(df_Player.columns)
print(df_Player.head())
print ('======')
print ('Player_Attributes table')
df_Player_Attributes = pd.read_sql("""SELECT * FROM Player_Attributes""", conn)
print('df_Player_Attributes.shape:', df_Player_Attributes.shape)
print(df_Player_Attributes.columns)
print(df_Player_Attributes.head())
print ('======')
# Acquire data from the database using pd.read_sql_query(sql, conn, ...):
# SELECT all columns from both Player and Player_Attributes tables
# for rows (records) with matching player_fifa_api_id.
# parse_dates converts 'birthday' and 'date' to datetime64 for the age
# calculation that follows.
sql="SELECT * FROM Player INNER JOIN Player_Attributes ON Player.player_fifa_api_id=Player_Attributes.player_fifa_api_id;"
df_all_col=pd.read_sql_query(sql, conn, coerce_float=True, params=None, parse_dates=['birthday','date'], chunksize=None)
# --- Feature engineering -----------------------------------------------------
# Age of the player at the time the attributes were collected.
# NOTE: the previous `.astype('timedelta64[Y]')` conversion was deprecated and
# removed in pandas 2.x; derive whole years from the day count instead
# (365.25-day mean year -- can differ by one day-boundary case from the old
# 365.2425-day numpy conversion, acceptable for an "age in years" feature).
df_all_col['age'] = np.floor((df_all_col.date - df_all_col.birthday).dt.days / 365.25)
# Tally a total score per player attribute category.
# skipna=False keeps the original `a + b + ...` semantics: any NaN within a
# group makes the total NaN, so the later dropna() still removes those rows.
_attribute_groups = {
    'total_attack': ['crossing', 'finishing', 'heading_accuracy',
                     'short_passing', 'volleys'],
    'total_skill': ['dribbling', 'curve', 'free_kick_accuracy',
                    'long_passing', 'ball_control'],
    'total_movement': ['acceleration', 'sprint_speed', 'agility',
                       'reactions', 'balance'],
    'total_power': ['shot_power', 'jumping', 'stamina', 'strength',
                    'long_shots'],
    'total_mentality': ['aggression', 'interceptions', 'positioning',
                        'vision', 'penalties'],
    'total_defending': ['marking', 'standing_tackle', 'sliding_tackle'],
    'total_goalkeeping': ['gk_diving', 'gk_handling', 'gk_kicking',
                          'gk_positioning', 'gk_reflexes'],
}
for _total_name, _group_cols in _attribute_groups.items():
    df_all_col[_total_name] = df_all_col[_group_cols].sum(axis=1, skipna=False)
print('df_all_col.columns: ', df_all_col.columns)  # column labels from both tables
print('df_all_col.shape:', df_all_col.shape)
print(df_all_col.info())
# Identify non-numeric and numeric columns of interest: two lists of labels.
non_numeric_col = ['player_fifa_api_id', 'player_api_id', 'player_name', 'birthday', 'date',
                   'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
numeric_col = ['age', 'height', 'weight', 'overall_rating', 'potential', 'crossing', 'finishing',
               'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
               'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration',
               'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
               'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
               'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving',
               'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
# Grouped category totals plus a few base measures -- used for a small scatter matrix.
numeric_few_col = ['age', 'height', 'weight', 'overall_rating', 'potential', 'total_attack',
                   'total_skill', 'total_movement', 'total_power', 'total_mentality',
                   'total_defending', 'total_goalkeeping']
# Turn whitespace-only entries into NaN so dropna() can catch them too.
df_all_col.replace(r'\s+', np.nan, regex=True, inplace=True)
df_all_col.dropna(axis=0, how='any', inplace=True)  # drop any row (sample) with an NA entry
df_all_col.sort_values('player_name', axis=0, inplace=True)
df_all_col.drop_duplicates(inplace=True)
df_all_col.to_csv('df_all_col.csv')
print ('df_all_col.shape: ', df_all_col.shape)
print(df_all_col.shape)
print(df_all_col.info())
print(df_all_col.head())
print(df_all_col.tail())
print(df_all_col['defensive_work_rate'][0:60]) # column needs more cleaning before plotting
print ('======')
df_unscaled_data = df_all_col[numeric_col]
print('df_unscaled_data.columns:', df_unscaled_data.columns)
print('df_unscaled_data.shape:', df_unscaled_data.shape)
print('df_unscaled_data.info: ', df_unscaled_data.info())
# Subset of tallied category scores. .copy() avoids SettingWithCopyWarning
# when identifier columns are assigned into df_few_col later in the script.
# (The original built an empty DataFrame first and immediately overwrote it
# -- that dead statement is removed.)
df_few_col = df_all_col[numeric_few_col].copy()
print('df_few_col.columns: ', df_few_col.columns)
print('df_few_col.shape:', df_few_col.shape)
print(df_few_col.info())
print(df_few_col.head())
# print scatter matrix
%matplotlib inline
fig = pd.plotting.scatter_matrix(df_few_col, alpha=0.1, figsize=(16, 16), diagonal='kde',range_padding =0.01)
plt.tight_layout()
plt.figure
plt.show()
plt.close()
df_few_col['player_fifa_api_id'] = df_all_col.iloc[:,3]
df_few_col['player_name'] = df_all_col.player_name
print('df_few_col.columns: ', df_few_col.columns)
df_few_col.to_csv('player_grouped_attribute_scores.csv')
print (df_few_col.head())
# Center each feature to zero mean and scale to unit variance.
# PCA is sensitive to feature scale, so this step is essential.
scaled_data = preprocessing.scale(df_unscaled_data)
print('scaled data:')
print(scaled_data)  # preview scaled data
# Create a PCA object. sklearn's PCA object can be fit (trained) on one
# dataset and then applied (transform) to another.
pca = PCA()
print(type(pca))
# fit: compute the loading scores and the variation each PC accounts for
pca.fit(scaled_data)
# transform: project the scaled data onto the principal components
pca_data = pca.transform(scaled_data)
# pca.explained_variance_ratio_ is a numpy.ndarray holding the fraction of
# variance that each principal component accounts for.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
print('=======================')
print('percent of explained variance: ')
print(per_var)
PC_labels = ['PC' + str(x) for x in range(1, len(per_var) + 1)]  # Scree Plot labels: PC1, PC2 ...
print(' ')
print('=======================')
# Scree Plot: explained variance per component.
plt.figure(figsize=(18, 6))
plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=PC_labels)
plt.ylabel('Percentage of Explained Variance', fontsize='14')
plt.xlabel('Principal Component', fontsize='14')
plt.title('Scree Plot', fontsize='18')
plt.show()
plt.close()
# Wrap the transformed coordinates in a DataFrame labelled PC1..PCn.
pca_df = pd.DataFrame(pca_data, index=None, columns=PC_labels)
print(pca_df.head())  # preview transformed and scaled data
print('=======================')
print('Principal Components Scatter Matrix')
# Scatter matrix of the first (up to) 20 principal components. The original
# hard-coded range(1, 21) (while its comment claimed 15) and would raise a
# KeyError if fewer than 20 features were present; clamp to what exists.
n_show = min(20, len(PC_labels))
df_pc_matrix = pca_df[['PC' + str(x) for x in range(1, n_show + 1)]]
pd.plotting.scatter_matrix(df_pc_matrix, alpha=0.1, figsize=(14, 14), diagonal='kde', range_padding=0.1)
plt.tight_layout()
plt.show()
plt.close()
def _pc_loading_report(pca_model, idx, labels):
    """Compute, print, and return the |loading| scores of one principal
    component, sorted descending.

    pca_model : fitted sklearn PCA (provides .components_)
    idx       : 0-based component index (0 -> PC1)
    labels    : feature names to index the scores with
    """
    name = 'PC%d' % (idx + 1)
    scores = pd.Series(pca_model.components_[idx], index=labels)
    sorted_scores = scores.abs().sort_values(ascending=False)
    print('Sorted %s Loading Scores (abs)' % name)
    print('%s sorted components: ' % name, sorted_scores.index)
    print(sorted_scores)
    return sorted_scores

# loading_scores_PC1_sorted is reused later for the unscaled scatter matrices.
loading_scores_PC1_sorted = _pc_loading_report(pca, 0, numeric_col)
print('=======')
loading_scores_PC2_sorted = _pc_loading_report(pca, 1, numeric_col)
print('=======')
loading_scores_PC3_sorted = _pc_loading_report(pca, 2, numeric_col)
# Note: when plotting PC1 versus PC2 or PC1 versus PC3, two clusters are displayed.
# Next, we visualize further in the PC1, PC2 and PC3 dimensions.
# Draw 2-D PCA plots -- PC1 vs PC2 and PC1 vs PC3 -- coloured by one original
# (scaled) feature, so the two clusters can be related back to attributes.
def color_plot(i):
    """Scatter PC1-vs-PC2 and PC1-vs-PC3 side by side, coloured by feature i.

    i indexes both numeric_col (for the title) and the columns of
    scaled_data (for the colour values); relies on the module globals
    pca_df, scaled_data, per_var and numeric_col.
    """
    plt.figure(figsize=(10, 5))
    for subplot_no, (pc_y, pc_idx) in enumerate([('PC2', 1), ('PC3', 2)], start=1):
        plt.subplot(1, 2, subplot_no)
        plt.scatter(pca_df.PC1, pca_df[pc_y], c=scaled_data[:, i], alpha=0.1)
        plt.title("PCA Graph: PC1 Versus " + pc_y + " - color by " + numeric_col[i], fontsize='12')
        plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize='12')
        plt.ylabel('{0} - {1}%'.format(pc_y, per_var[pc_idx]), fontsize='12')
        plt.tight_layout()
    plt.show()
    plt.close()

# One pair of plots per numeric feature (was a hard-coded range(0, 38)).
for j in range(len(numeric_col)):
    color_plot(j)
def plot(df_all, df_sub, hue_col):
    """Draw two ball_control-vs-marking scatter plots: all players, then the
    goalkeeper subset; points optionally coloured by hue_col (or None).

    NOTE(review): passing sharex directly was deprecated in newer seaborn
    (use facet_kws={'sharex': False}) -- confirm the installed version.
    """
    panels = [(df_all, 'all players'),
              (df_sub, 'players with gk_diving > 40 (goalkeepers)')]
    for panel_df, panel_title in panels:
        sns.lmplot(x='ball_control', y='marking', hue=hue_col, sharex=False,
                   data=panel_df, scatter=True, fit_reg=False, units=None,
                   order=1, legend=True)
        plt.title(panel_title)
        plt.xlim(0, 100)
        plt.ylim(0, 100)
        plt.show()
        plt.close()

print('gk_diving > 40 (goalkeepers)')
# Players with a high diving score are treated as goalkeepers.
df_goalkeepers = df_all_col.loc[df_all_col['gk_diving'] > 40]
plot(df_all_col, df_goalkeepers, None)
# Colour by preferred foot (lefty vs righty).
plot(df_all_col, df_goalkeepers, 'preferred_foot')
# Left-footed players only.
print('plot preferred left foot')
df1 = df_all_col.loc[df_all_col['preferred_foot'] == 'left']
df2 = df_goalkeepers.loc[df_goalkeepers['preferred_foot'] == 'left']
plot(df1, df2, None)
# Drop placeholder attacking_work_rate values ('y', 'None') before plotting.
df1 = df_all_col.loc[df_all_col['attacking_work_rate'] != 'y']
df1 = df1.loc[df1['attacking_work_rate'] != 'None']
df2 = df_goalkeepers.loc[df_goalkeepers['attacking_work_rate'] != 'y']
df2 = df2.loc[df2['attacking_work_rate'] != 'None']
plot(df1, df2, 'attacking_work_rate')
# Jointplots of outfield skills against each goalkeeper attribute:
# 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'.
def joint_plot(df, title):
    """Draw one jointplot per (outfield skill, goalkeeper attribute) pair.

    Replaces ten copy-pasted stanzas with a 2x5 loop; identical plots in
    the same order (all gk columns for ball_control, then for dribbling).
    NOTE(review): stat_func was removed in seaborn >= 0.11 -- confirm the
    installed version before upgrading.
    """
    gk_cols = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
    for x_col in ('ball_control', 'dribbling'):
        for y_col in gk_cols:
            sns.jointplot(x=x_col, y=y_col, data=df, xlim=(0, 100),
                          ylim=(0, 100), stat_func=None)
            plt.title(title, loc='left')
            plt.show()
            plt.close()

joint_plot(df_all_col, 'all players')
joint_plot(df_goalkeepers, 'gk_diving > 40 (goalkeepers)')
'''
Interpret correlation coefficient
Exactly –1. A perfect downhill (negative) linear relationship
•–0.70. A strong downhill (negative) linear relationship
•–0.50. A moderate downhill (negative) relationship
•–0.30. A weak downhill (negative) linear relationship
•0. No linear relationship
'''
# Save the correlation coefficients of the numeric features to csv.
df_corr = df_unscaled_data.corr()
df_corr.to_csv('df_corr.csv')
# Two scatter matrices over the unscaled data, split by PC1 loading rank:
# the 19 most influential features, then the remaining 19.
for matrix_no, (lo, hi) in enumerate([(0, 19), (19, 38)], start=1):
    print('Unscaled Data Scatter Matrix %d' % matrix_no)
    print('PC1 sorted components: ', loading_scores_PC1_sorted.index[lo:hi])
    col_of_interest = loading_scores_PC1_sorted.index[lo:hi]
    df_col_of_interest = df_unscaled_data[col_of_interest]
    pd.plotting.scatter_matrix(df_col_of_interest, alpha=0.1, figsize=(16, 16),
                               diagonal='kde', range_padding=0.01)
    plt.tight_layout()
    plt.show()
    plt.close()
# Create distribution plots for all features.
final_col = ['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate'] + numeric_col
print(final_col)
df_final = df_all_col[final_col]
print(df_final.head())
print(len(final_col))
# Grid of the five goalkeeper attribute distributions.
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# migrate to sns.histplot(..., kde=True) when upgrading seaborn.
gk_cols = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
fig = plt.figure(figsize=(24, 18))
for plot_no, gk_col in enumerate(gk_cols, start=1):
    ax = fig.add_subplot(3, 2, plot_no)
    sns.distplot(df_all_col[gk_col], bins=30, ax=ax)
plt.show()
plt.close()
# One distribution plot per remaining numeric feature. Stop before the five
# gk_* columns already shown above: the original range(4, 38) both re-plotted
# gk_diving and skipped the last four gk columns.
for i in range(4, len(final_col) - 5):
    sns.distplot(df_all_col[final_col[i]], bins=30)
    plt.title('Distribution Plot')
    plt.show()
    plt.close()
# The graph of PC1 versus PC2 and the graph of PC1 versus PC3 show that the
# data falls into two distinct clusters. However, after reducing dimensions
# (players' attributes), information is lost. If considering only one player
# attribute at a time, only some of the distribution plots display a bimodal
# distribution, suggesting clustering. One may investigate further with
# information from the database to find out what contributes to the division
# of the set of players into two clusters.
# Additionally, correlations between different player attributes may be
# investigated.
# A closer look at the loading values for PC1 shows that, basically, no skill
# is much more influential than the others.
# Sorted loading scores for all attributes in the PC1 dimension:
# ball_control 0.233895
# dribbling 0.226609
# short_passing 0.220168
# crossing 0.213529
# curve 0.211442
# long_shots 0.211257
# positioning 0.204851
# shot_power 0.199655
# vision 0.197614
# gk_diving 0.196600
# free_kick_accuracy 0.196366
# gk_reflexes 0.195234
# gk_handling 0.195011
# gk_positioning 0.193662
# volleys 0.193077
# finishing 0.191773
# acceleration 0.185480
# penalties 0.183370
# sprint_speed 0.182038
# long_passing 0.177636
# agility 0.168734
# stamina 0.158548
# balance 0.153811
# height 0.134852
# heading_accuracy 0.132158
# weight 0.125473
# gk_kicking 0.121982
# reactions 0.105162
# aggression 0.087644
# overall_rating 0.083033
# potential 0.080174
# interceptions 0.071469
# standing_tackle 0.065230
# sliding_tackle 0.059237
# marking 0.052163
# strength 0.028138
# age 0.005573
# jumping 0.001561
# Obviously, all the skills are unique and require related special training.
# Due to the roles of goalkeepers, their training is focused on certain
# attributes: 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
# 'gk_reflexes'. High scores in these attributes, except ## type list here !!
# based on the correlation coefficients saved in the csv file, set these
# players apart from other players and make them form one cluster. On some
# occasions, these attributes correlate highly with other attributes,
# such as ....
# More to come ...